library(ProjectTemplate)
cwd <- getwd()
setwd("../")
load.project()
setwd(cwd)

Objective

Initial analysis of the logFC calculation results.

Loading Data

Input data is the tidy data frames generated by 2017-02-01-log-fold-change-df_dev.Rmd.

logFC_MgSeq <- readRDS("../data/logFC_MgSeq_df.rds")
logFC_DESeq2 <- readRDS("../data/logFC_DESeq2_df.rds")
logFC_edgeR <- readRDS("../data/logFC_edgeR_df.rds")

MgSeq logFC and SE Estimates

glimpse(logFC_MgSeq)
## Observations: 885,220
## Variables: 9
## $ pipe         <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "dad...
## $ sampleID     <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH0004...
## $ T1           <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...
## $ T2           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ featureNames <chr> "Seq8", "Seq10", "Seq107", "Seq36", "Seq175", "Se...
## $ logFC        <dbl> -7.2958916, 3.9019979, -3.5145787, 2.8770745, -2....
## $ se           <dbl> 0.3071946, 0.1457152, 0.1444001, 0.1504701, 0.156...
## $ pvalues      <dbl> 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.00000...
## $ adjPvalues   <dbl> 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.00000...

Subset of data to make it easier to work with

logFC_MgSeq_sub <- logFC_MgSeq %>% filter(!is.na(logFC), !is.na(se)) %>% sample_frac(size = 0.25)

logFC estimate distributions:

For all five of the biological replicates most of the logFC estimates were between -2 and 2 log2.

logFC_MgSeq_sub %>% ggplot() + geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
      theme_bw()

logFC estimate distributions:

For all five of the biological replicates the standard error for most log fold-change estimates was less than 1. However, there were some log fold-change estimates with large standard error estimates for all biological replicates.

logFC_MgSeq_sub %>% ggplot() + 
      geom_boxplot(aes(x = pipe, y = se, color = sampleID)) + theme_bw() + scale_y_log10()

Log fold-change estimates with large standard errors are between -3 and and 2. Most of the features with outlier standard errors are from the QIIME pipeline. However, this is potentially biased as QIIME pipeline had the most features to begin with.

outlier_se <- logFC_MgSeq_sub %>% group_by(sampleID) %>% 
      mutate(u99 = quantile(se, 0.995)) %>% filter(se > u99) 
outlier_se %>% {summary(.$logFC)}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1140 -0.7833 -0.2656 -0.2991  0.2008  2.8570
outlier_se %>% ggplot() + geom_bar(aes(x=pipe)) + theme_bw()

Features with log fold-change estimates with outlier standard error estimates.

outlier_se %>% group_by(pipe) %>% summarise(n_features = n_distinct(featureNames))
## # A tibble: 3 × 2
##     pipe n_features
##    <chr>      <int>
## 1  dada2         22
## 2 mothur         37
## 3  qiime         66

A number of the features with log fold change standard error estimates that are outliers, were outliers for multiple titration comparisons.

outlier_se %>% group_by(pipe, featureNames) %>% summarise(count = n()) %>% filter(count > 2)
## Source: local data frame [9 x 3]
## Groups: pipe [3]
## 
##     pipe featureNames count
##    <chr>        <chr> <int>
## 1  dada2       Seq278    11
## 2  dada2       Seq379     4
## 3  dada2       Seq393     3
## 4 mothur     Otu00003     4
## 5 mothur     Otu00076     8
## 6 mothur     Otu00102     5
## 7 mothur     Otu00160     3
## 8  qiime       315846     3
## 9  qiime       568118     4

Relationship between logFC estimates and Standard Errors

logFC_MgSeq_sub %>% 
      ggplot() + geom_point(aes(x = logFC, y = se, color = factor(T1)), alpha = 0.25) + 
      scale_y_log10() + facet_wrap(pipe~sampleID, scale = "free_y") + theme_bw()

The log fold change between adjacent titrations will be smaller as the samples are more similar, this effect increases with titration level. This is evident with narrowing of the point distributions. A naive expectation is that the log fold-change should not be greater than the difference in the sample titation values.

Excluding featues with outlier standard error estimates

logFC_MgSeq_sub %>% group_by(sampleID) %>% 
      mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99)  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") + 
      facet_grid(T1~T2) + theme_bw()

Separate plots for the different pipelines

logFC_MgSeq_sub %>% group_by(sampleID) %>% 
      mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "dada2")  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") + 
      facet_grid(T1~T2) + theme_bw()

logFC_MgSeq_sub %>% group_by(sampleID) %>% 
      mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "mothur")  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") + 
      facet_grid(T1~T2) + theme_bw()

logFC_MgSeq_sub %>% group_by(sampleID) %>% 
      mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99, pipe == "qiime")  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") + 
      facet_grid(T1~T2) + theme_bw()

### DESeq2 logFC and SE Estimates

glimpse(logFC_DESeq2)
## Observations: 885,220
## Variables: 11
## $ pipe           <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "d...
## $ sampleID       <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH00...
## $ T1             <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,...
## $ T2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ featureNames   <chr> "Seq1", "Seq2", "Seq3", "Seq4", "Seq5", "Seq6",...
## $ baseMean       <dbl> 6273.479221, 4428.241244, 334.645867, 1907.1513...
## $ log2FoldChange <dbl> -4.41528734, 2.48603970, -1.97655120, 0.1301318...
## $ lfcSE          <dbl> 0.2844473, 0.2082474, 2.6937137, 0.1364468, 2.4...
## $ stat           <dbl> -15.5223404, 11.9379161, -0.7337644, 0.9537185,...
## $ pvalue         <dbl> 2.449515e-54, 7.508077e-33, 4.630923e-01, 3.402...
## $ padj           <dbl> 1.504002e-52, 1.152490e-31, 4.970956e-01, 3.730...

Subset of data to make it easier to work with

## renaming variables for consistency between differential abundance methods
logFC_DESeq2_sub <- logFC_DESeq2 %>% dplyr::rename(logFC = log2FoldChange, se = lfcSE) %>% 
      filter(!is.na(logFC), !is.na(se)) %>% sample_frac(size = 0.25)

For all five of the biological replicates most of the logFC estimates were between -4 and 4 log2. The DADA2 logFC estimates tend to be larger compare to the pipelines. This could be due to improper clustering of sequences. Where a sequence is incorretly clustered with another sequence that is highly abundant in one sample but only improperly sequences are cluster leading to inflated log fold-change estimates.

logFC_DESeq2_sub %>% ggplot() + geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
      theme_bw()

Distribution of standard error estimates

No extremely large standard error estimates like thoes observed for fitFeatureModel. Not sure what to make of the standard error estimates for mothur concentrated around 3.4.

logFC_DESeq2_sub %>% ggplot() + 
      geom_boxplot(aes(x = pipe, y = se, color = sampleID)) + theme_bw()

Relationship Between logFC and Standard Errror

DADA2

logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "dada2") %>% 
      # mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99)  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") +
      facet_grid(T1~T2) + theme_bw()

Mothur

logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "mothur") %>% 
      # mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99)  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") +
      facet_grid(T1~T2) + theme_bw()

QIIME

logFC_DESeq2_sub %>% group_by(sampleID) %>% filter(pipe == "qiime") %>% 
      # mutate(u99 = quantile(se, 0.995)) %>% filter(se < u99)  %>%
      ggplot() + geom_point(aes(x = logFC, y = se, color = sampleID, shape = pipe), alpha = 0.25) + 
      geom_vline(aes(xintercept = 1), color = "grey") + 
      geom_vline(aes(xintercept = -1), color = "grey") +
      facet_grid(T1~T2) + theme_bw()

edgeR logFC Estimates

edgeR did not calculate the standard error for the log fold-change

glimpse(logFC_edgeR)
## Observations: 885,220
## Variables: 25
## $ pipe      <chr> "dada2", "dada2", "dada2", "dada2", "dada2", "dada2"...
## $ sampleID  <chr> "E01JH0004", "E01JH0004", "E01JH0004", "E01JH0004", ...
## $ T1        <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ T2        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ OTUname   <chr> "Seq88", "Seq13", "Seq86", "Seq30", "Seq18", "Seq46"...
## $ Sequence  <chr> "Seq88", "Seq13", "Seq86", "Seq30", "Seq18", "Seq46"...
## $ logFC     <dbl> -12.022005, 11.418198, -11.780843, 10.997541, 11.400...
## $ logCPM    <dbl> 14.98450, 14.43363, 14.74363, 14.01374, 14.41588, 12...
## $ PValue    <dbl> 7.266076e-40, 1.900690e-37, 1.857515e-36, 1.697865e-...
## $ FDR       <dbl> 3.894617e-37, 5.093848e-35, 3.318761e-34, 2.275139e-...
## $ Rank1     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank2     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank3     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank4     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank5     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank6     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank7     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Rank8     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy6 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ taxonomy7 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

Subset of data to make it easier to work with

## renaming variables for consistency between differential abundance methods 
## Subsetting for features of interest
logFC_edgeR_sub <- logFC_edgeR %>% 
      select(pipe, sampleID, T1, T2, OTUname, logFC) %>% dplyr::rename(featureName = OTUname) %>%  
      filter(!is.na(logFC)) %>% sample_frac(size = 0.25)

For all five of the biological replicates most of the logFC estimates were between -2 and 2 log2. The DADA2 logFC estimates tend to be larger compare to the pipelines. This could be due to improper clustering of sequences. Where a sequence is incorretly clustered with another sequence that is highly abundant in one sample but only improperly sequences are cluster leading to inflated log fold-change estimates.

logFC_edgeR_sub %>% ggplot() + 
      geom_hline(aes(yintercept = -2)) + geom_hline(aes(yintercept = 2)) + 
      geom_boxplot(aes(x = pipe, y = logFC, color = sampleID)) +
      theme_bw()

logFC by titration comparisons

logFC_edgeR_sub %>% ggplot() + 
      geom_hline(aes(yintercept = -2)) + geom_hline(aes(yintercept = 2)) + 
      geom_boxplot(aes(x = pipe, y = logFC)) +
      theme_bw() + facet_grid(T1~T2)

Method Agreement

Can only compare differential abundance methods by pipeline as the features are not directly comparable, can look into comparing by taxonomic classification later.

MgSeq <- logFC_MgSeq %>% filter(!is.na(logFC), !is.na(logFC)) %>% 
      select(pipe, sampleID, T1, T2, featureNames, logFC) %>% 
      dplyr::rename(logFC_MgSeq = logFC)
DSeq <- logFC_DESeq2 %>% dplyr::rename(logFC = log2FoldChange) %>%
      select(pipe, sampleID, T1, T2, featureNames, logFC) %>% 
      dplyr::rename(logFC_DESeq2 = logFC)
edge <- logFC_edgeR %>% dplyr::rename(featureNames = OTUname) %>% 
      select(pipe, sampleID, T1, T2, featureNames, logFC) %>% 
      dplyr::rename(logFC_edgeR = logFC)
diff_abu_comp <- MgSeq %>% inner_join(DSeq) %>% inner_join(edge)
## Joining, by = c("pipe", "sampleID", "T1", "T2", "featureNames")
## Joining, by = c("pipe", "sampleID", "T1", "T2", "featureNames")

Only 66K features had log fold change estimates for all three differential abundance methods. No features were called for all

diff_abu_comp %>% ggplot() + 
      geom_abline(aes(intercept = 0, slope = 1), linetype = 2) + 
      geom_point(aes(x = logFC_MgSeq, y = logFC_DESeq2, fill = sampleID),
                 shape = 21, color = "white",  alpha = 0.25) +
      facet_grid(.~pipe)

diff_abu_comp %>% ggplot() + 
      geom_abline(aes(intercept = 0, slope = 1), linetype = 2) + 
      geom_point(aes(x = logFC_MgSeq, y = logFC_edgeR, fill = sampleID),
                 shape = 21, color = "white",  alpha = 0.25) +
      facet_grid(.~pipe)

diff_abu_comp %>% ggplot() + 
      geom_abline(aes(intercept = 0, slope = 1), linetype = 2) + 
      geom_point(aes(x = logFC_DESeq2, y = logFC_edgeR, fill = sampleID),
                 shape = 21, color = "white",  alpha = 0.25) +
      facet_grid(.~pipe)

## Downsample for more manageable dataset
diff_abu_comp %>% sample_frac(0.25) %>% 
      gather("diff_abu", "logFC", -pipe, -sampleID, -T1, -T2,
             -featureNames,factor_key = TRUE) %>% 
      mutate(group_set = paste(pipe, sampleID, T1, T2, featureNames)) %>% 
      ggplot() + 
      geom_point(aes(x = diff_abu, y = logFC)) + 
      geom_line(aes(x = as.numeric(diff_abu), y = logFC, group = group_set), alpha = 0.05) +
      facet_wrap(~pipe)

## only features with logFC > +/- 1
diff_abu_comp %>% filter(logFC_MgSeq < -1 | logFC_MgSeq > 1 | 
                         logFC_DESeq2 < -1 | logFC_DESeq2 > 1 |
                         logFC_edgeR < -1 | logFC_edgeR > 1) %>%  #sample_frac(0.25) %>% 
      gather("diff_abu", "logFC", -pipe, -sampleID, -T1, -T2,
             -featureNames,factor_key = TRUE) %>% 
      mutate(group_set = paste(pipe, sampleID, T1, T2, featureNames)) %>% 
      ggplot() + 
      geom_point(aes(x = diff_abu, y = logFC)) + 
      geom_line(aes(x = as.numeric(diff_abu), y = logFC, group = group_set), alpha = 0.05) +
      facet_wrap(~pipe)